Objectives

  • Define elementary perceptual tasks
  • Review hierarchy of perceptual tasks and how this was developed
  • Design experiments to test theories of perceptual tasks and Tufte’s data-ink ratio
library(tidyverse)
library(ggthemes)
library(knitr)
library(broom)
library(stringr)
library(plotly)

# Print numbers with 3 significant digits
options(digits = 3)
# Seed the random number generator so randomly generated data is reproducible
set.seed(1234)
# Make theme_minimal() the default theme for plots that do not set their own
theme_set(theme_minimal())
#' Nearly blank theme that leaves the x-axis text in place
#'
#' Builds a complete theme in which all line and rectangle elements are
#' blank, the y-axis text and both axis titles are removed, and the plot
#' margin is zero. Unlike the y-axis text, the x-axis text is not blanked.
#'
#' @param base_size Font size used for the base text element.
#' @param base_family Font family used for the base text element.
#' @return A complete ggplot2 theme.
theme_void2 <- function(base_size = 11, base_family = "") {
  # Base text element that every other text element is sized against
  base_text <- element_text(
    family = base_family,
    face = "plain",
    colour = "black",
    size = base_size,
    lineheight = 0.9,
    hjust = 0.5,
    vjust = 0.5,
    angle = 0,
    margin = margin(),
    debug = FALSE
  )
  # Legend and facet-strip labels are drawn at 80% of the base size
  small_text <- element_text(size = rel(0.8))

  theme(
    line = element_blank(),
    rect = element_blank(),
    text = base_text,
    axis.text.y = element_blank(),
    axis.title = element_blank(),
    legend.text = small_text,
    legend.title = element_text(hjust = 0),
    strip.text = small_text,
    plot.margin = unit(rep(0, 4), "lines"),
    complete = TRUE
  )
}

Cleveland and McGill

  • What makes a graph more accurate?

A graphical form that involves elementary perceptual tasks that lead to more accurate judgments than another graphical form (with the same quantitative information) will result in better organization and increase the chances of a correct perception of patterns and behavior.

Figure 6.12 from The Functional Art

Figure 6.12 from The Functional Art

Basic perceptual tasks on graphs

Bar chart

# Share of the sample falling in each cut category
diamonds_sum <- diamonds %>%
  count(cut) %>%
  mutate(pct = n / sum(n))

# Bar chart of those shares, with the y axis formatted as percentages
diamonds_sum %>%
  ggplot(mapping = aes(x = cut, y = pct)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Diamonds data") +
  xlab("Cut of diamond") +
  ylab("Percentage of sample")

Pie chart

# One stacked column of the cut shares, bent into a circle with polar coordinates
diamonds_sum %>%
  ggplot(mapping = aes(x = factor(1), y = pct, fill = cut)) +
  geom_col(width = 1) +
  theme_void() +
  coord_polar(theta = "y", direction = -1)

Statistical maps with color

library(fiftystater)

# Explicit load of the state outlines; optional because the data is lazy-loaded
data("fifty_states")

# USArrests keeps the state names in its row names; lower-case them into a
# `state` column to use as the map id
state_id <- tolower(rownames(USArrests))
crimes <- data.frame(state = state_id, USArrests)

# Fill each state outline by its Assault value
p <- crimes %>%
  # map_id ties each data row to a state outline through the state column
  ggplot(mapping = aes(map_id = state)) +
  # the outlines themselves come from the fifty_states shape data
  geom_map(mapping = aes(fill = Assault), map = fifty_states) +
  expand_limits(x = fifty_states$long, y = fifty_states$lat) +
  coord_map() +
  # no axis breaks and empty axis titles
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(breaks = NULL) +
  xlab("") +
  ylab("") +
  theme(panel.background = element_blank(),
        legend.position = "bottom") +
  fifty_states_inset_boxes()

# draw the map with the default fill colors
p

Scatterplot

# Simulate 20 observations with correlation 0.95 and bind the result into
# one data frame
xy <- bind_cols(ecodist::corgen(len = 20, r = 0.95))

# The two variables against each other as a scatterplot
xy %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point()

# The same values as side-by-side bars, one pair per observation
xy %>%
  mutate(id = row_number()) %>%
  gather(key = var, value = value, -id) %>%
  ggplot(mapping = aes(x = id, y = value, fill = var)) +
  geom_col(position = "dodge")

Curvature

# Load the raw poll table
polls <- read_csv("data/538.csv")
## Parsed with column specification:
## cols(
##   Dates = col_character(),
##   Pollster = col_character(),
##   Grade = col_character(),
##   Sample = col_number(),
##   Weight = col_double(),
##   Approve = col_character(),
##   Disapprove = col_character(),
##   Approve.adj = col_character(),
##   Disapprove.adj = col_character()
## )
# Keep one date per poll plus the adjusted approve/disapprove shares as proportions
polls_data <- polls %>%
  # split Dates on "-"; only the first piece is used as the poll date
  separate(Dates, into = c("Date", "etc"), sep = "-") %>%
  # the first piece carries no year, so append 2017 before parsing it
  mutate(Date = lubridate::mdy(str_c(Date, ", 2017"))) %>%
  # the approval columns are read as text; pull the numbers out of them
  mutate_at(vars(Approve:Disapprove.adj), parse_number) %>%
  select(Date, Approve.adj, Disapprove.adj) %>%
  # rescale the two adjusted columns by 1/100
  mutate(Approve.adj = Approve.adj / 100,
         Disapprove.adj = Disapprove.adj / 100)


# Adjusted approval and disapproval over time: one point per poll (faded)
# with a smoothed trend line per series
polls_data %>%
  # reshape to one row per poll and series so color can map to the series
  gather(var, value, -Date) %>%
  # drop the ".adj" suffix from the legend labels
  mutate(var = factor(var, levels = c("Approve.adj", "Disapprove.adj"),
                      labels = c("Approve", "Disapprove"))) %>%
  ggplot(aes(Date, value, color = var)) +
  geom_point(alpha = .2) +
  geom_smooth() +
  # the argument is `labels`; `label` only worked through partial matching
  scale_y_continuous(labels = scales::percent) +
  labs(title = "How popular/unpopular is Donald Trump?",
       y = NULL,
       color = NULL,
       caption = "Source: FiveThirtyEight") +
  theme(legend.position = "bottom")
## `geom_smooth()` using method = 'loess'

# Net approval over time: adjusted approval minus adjusted disapproval,
# one point per poll (faded) with a smoothed trend line
polls_data %>%
  mutate(diff = Approve.adj - Disapprove.adj) %>%
  ggplot(aes(Date, diff)) +
  geom_point(alpha = .2) +
  geom_smooth() +
  # the argument is `labels`; `label` only worked through partial matching
  scale_y_continuous(labels = scales::percent) +
  labs(title = "How popular/unpopular is Donald Trump?",
       y = NULL,
       caption = "Source: FiveThirtyEight")
## `geom_smooth()` using method = 'loess'

Experimental design

  • \(N = 55\)
  • Used bar charts and pie charts
  • Experiment 1 - asked to make assessments of length and position along a common scale
  • Experiment 2 - asked to make judgments of position and angle (pie vs bar chart)
  • Make visual assessments of what percentage one value was of a larger value

Cairo - applying perceptual task scale

A proportional symbol map. Source: The Functional Art

A proportional symbol map. Source: The Functional Art

Choropleth map. Source: The Functional Art

Choropleth map. Source: The Functional Art

Dot chart. Source: The Functional Art

Dot chart. Source: The Functional Art

Scatterplot. Source: The Functional Art

Scatterplot. Source: The Functional Art

Slopegraph. Source: The Functional Art

Slopegraph. Source: The Functional Art

What we learn

  • Use theory and experiments to develop rules for data visualization (not Tufte’s assertion approach)
  • Different perceptual tasks (i.e. different channels) are easier or harder to decode and accurately interpret
  • Pick the highest-level encoding task possible given the data structure/purpose of the visualization

Crowdsourcing Graphical Perception

Amazon Mechanical Turk

Transparency of gridlines

  • Recall that Tufte hates gridlines
  • Experiment 2 crowdsources the optimal transparency for the gridlines and data points in a scatterplot
# City vs. highway mileage scatterplot shared by the four gridline variants
mpg_scatter <- ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point()

# Gridlines as drawn by the active default theme
mpg_scatter

# Major and minor gridlines in grey70
mpg_scatter +
  theme(panel.grid.major = element_line(colour = "grey70"),
        panel.grid.minor = element_line(colour = "grey70"))

# Major and minor gridlines in grey50
mpg_scatter +
  theme(panel.grid.major = element_line(colour = "grey50"),
        panel.grid.minor = element_line(colour = "grey50"))

# Major and minor gridlines in grey20
mpg_scatter +
  theme(panel.grid.major = element_line(colour = "grey20"),
        panel.grid.minor = element_line(colour = "grey20"))

  • Individuals prefer lighter (more transparent) backgrounds
  • Could be run using a Shiny app integrated into an MTurk experiment

In defense of pie charts

  • Which is easier to distinguish: line length or area, angle, and arc length?
  • Psychophysical theory of perception. Function of relationship between perceived areas of circles and their physical areas

    \[\text{Subjective area} = \text{Area}^{0.86}\]

    • People misjudge/misestimate area
    • This function is not derived from rigorous science
    • Focus more on the subarea (area/size of individual slices rather than the entire circle)
    • People also do not generally use pie charts to estimate precise magnitudes (if so, use a table instead)
# Four-category toy data set reused by the bar, proportional-area, pie and
# table examples. Built with tibble(): data_frame() is the deprecated alias.
pie <- tibble(label = c("A", "B", "C", "D"),
              value = c(10, 20, 40, 30))

# bar chart
ggplot(pie, aes(label, value)) +
  # white bars with a black outline
  geom_col(color = "black", fill = "white") +
  theme_void2()

# proportional area chart
# Lay the categories end to end along x: each rectangle runs from the running
# total before it (xmin) to the running total including it (xmax), with unit
# height; x is the rectangle's horizontal midpoint, used to place its label
pie_prop <- pie %>%
  mutate(
    ymin = 0,
    ymax = 1,
    xmax = cumsum(value),
    xmin = xmax - value,
    x = xmin + (xmax - xmin) / 2
  )

pie_prop %>%
  ggplot(mapping = aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax)) +
  geom_rect(colour = "black", fill = "white") +
  # one axis break per rectangle, at its midpoint, labelled with the category
  scale_x_continuous(breaks = pie_prop$x, labels = pie_prop$label) +
  theme_void2()

# pie chart
# One stacked column wrapped into polar coordinates, each slice labelled
# at the middle of its stack segment
pie %>%
  ggplot(mapping = aes(x = factor(1), y = value)) +
  geom_col(width = 1, colour = "black", fill = "white") +
  geom_text(mapping = aes(label = label),
            size = 4,
            position = position_stack(vjust = 0.5)) +
  theme_void() +
  theme(legend.position = "none") +
  coord_polar(theta = "y", direction = 1)

# table
# Transpose the value column into a single row and use the category labels
# as the column headers
knitr::kable(
  t(select(pie, value)),
  col.names = pie$label,
  row.names = FALSE
)
A B C D
10 20 40 30

Experiment 1

  • Compare speed at processing and making comparisons between bar charts and pie charts
  • No substantial difference in processing time or accuracy of interpretations - see Figures 2 and 3

Experiment 2

  • Ordering of the bars/slices didn’t really affect interpretation speed or accuracy - ordering apparently is not a major perceptual strategy

Experiment 3

  • Pie charts better for A + B vs. C + D comparisons
  • Hypothesis: in pie charts, adjacent slices were easier to visually combine together
  • In truth, this didn’t matter - but for bar charts, adjacency actually helped

Takeaways

  • For pure magnitude identification, bar charts are superior
  • For comparing percentages, either chart is acceptable
  • To compare combinations of groupings, pie charts are slightly superior
  • Tables are only useful if you want to report exact percentages

Design a data visualization experiment

  • Work with someone next to you (or a group of three)
  • Develop a research question applicable to visual design that can be answered experimentally
    • Replicate something from Cleveland and McGill
    • Extend to additional question
    • Tufte minimalism
  • How will you design two (or more) graphs to obtain an accurate and unbiased estimate?

Session Info

# Print the R session settings and package versions used for this document
devtools::session_info()
## Session info --------------------------------------------------------------
##  setting  value                       
##  version  R version 3.3.3 (2017-03-06)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2017-04-17
## Packages ------------------------------------------------------------------
##  package     * version  date       source         
##  assertthat    0.1      2013-12-06 CRAN (R 3.3.0) 
##  backports     1.0.5    2017-01-18 CRAN (R 3.3.2) 
##  base64enc     0.1-3    2015-07-28 CRAN (R 3.3.0) 
##  broom       * 0.4.2    2017-02-13 CRAN (R 3.3.2) 
##  colorspace    1.3-2    2016-12-14 CRAN (R 3.3.2) 
##  DBI           0.6      2017-03-09 CRAN (R 3.3.3) 
##  devtools      1.12.0   2016-06-24 CRAN (R 3.3.0) 
##  digest        0.6.12   2017-01-27 CRAN (R 3.3.2) 
##  dplyr       * 0.5.0    2016-06-24 CRAN (R 3.3.0) 
##  evaluate      0.10     2016-10-11 CRAN (R 3.3.0) 
##  forcats       0.2.0    2017-01-23 CRAN (R 3.3.2) 
##  foreign       0.8-67   2016-09-13 CRAN (R 3.3.3) 
##  ggplot2     * 2.2.1    2016-12-30 CRAN (R 3.3.2) 
##  ggthemes    * 3.4.0    2017-02-19 CRAN (R 3.3.2) 
##  gtable        0.2.0    2016-02-26 CRAN (R 3.3.0) 
##  haven         1.0.0    2016-09-23 cran (@1.0.0)  
##  hms           0.3      2016-11-22 CRAN (R 3.3.2) 
##  htmltools     0.3.5    2016-03-21 CRAN (R 3.3.0) 
##  htmlwidgets   0.8      2016-11-09 CRAN (R 3.3.1) 
##  httr          1.2.1    2016-07-03 CRAN (R 3.3.0) 
##  jsonlite      1.3      2017-02-28 CRAN (R 3.3.2) 
##  knitr       * 1.15.1   2016-11-22 cran (@1.15.1) 
##  lattice       0.20-34  2016-09-06 CRAN (R 3.3.3) 
##  lazyeval      0.2.0    2016-06-12 CRAN (R 3.3.0) 
##  lubridate     1.6.0    2016-09-13 CRAN (R 3.3.0) 
##  magrittr      1.5      2014-11-22 CRAN (R 3.3.0) 
##  memoise       1.0.0    2016-01-29 CRAN (R 3.3.0) 
##  mnormt        1.5-5    2016-10-15 CRAN (R 3.3.0) 
##  modelr        0.1.0    2016-08-31 CRAN (R 3.3.0) 
##  munsell       0.4.3    2016-02-13 CRAN (R 3.3.0) 
##  nlme          3.1-131  2017-02-06 CRAN (R 3.3.3) 
##  plotly      * 4.5.6    2016-11-12 CRAN (R 3.3.2) 
##  plyr          1.8.4    2016-06-08 CRAN (R 3.3.0) 
##  psych         1.7.3.21 2017-03-22 CRAN (R 3.3.2) 
##  purrr       * 0.2.2    2016-06-18 CRAN (R 3.3.0) 
##  R6            2.2.0    2016-10-05 CRAN (R 3.3.0) 
##  Rcpp          0.12.10  2017-03-19 cran (@0.12.10)
##  readr       * 1.1.0    2017-03-22 cran (@1.1.0)  
##  readxl        0.1.1    2016-03-28 CRAN (R 3.3.0) 
##  reshape2      1.4.2    2016-10-22 CRAN (R 3.3.0) 
##  rmarkdown     1.3      2016-12-21 CRAN (R 3.3.2) 
##  rprojroot     1.2      2017-01-16 CRAN (R 3.3.2) 
##  rvest         0.3.2    2016-06-17 CRAN (R 3.3.0) 
##  scales        0.4.1    2016-11-09 CRAN (R 3.3.1) 
##  stringi       1.1.2    2016-10-01 CRAN (R 3.3.0) 
##  stringr     * 1.2.0    2017-02-18 CRAN (R 3.3.2) 
##  tibble      * 1.2      2016-08-26 cran (@1.2)    
##  tidyr       * 0.6.1    2017-01-10 CRAN (R 3.3.2) 
##  tidyverse   * 1.1.1    2017-01-27 CRAN (R 3.3.2) 
##  viridisLite   0.1.3    2016-03-12 cran (@0.1.3)  
##  withr         1.0.2    2016-06-20 CRAN (R 3.3.0) 
##  xml2          1.1.1    2017-01-24 CRAN (R 3.3.2) 
##  yaml          2.1.14   2016-11-12 cran (@2.1.14)